suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Project root; the input and output paths in this section are built from it.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
# NOTE(review): setwd() changes global state. Every path visible in this
# section is already prefixed with `wd`, so this call may be unnecessary --
# confirm that nothing else relies on the working directory before removing.
setwd(wd)


# Output directories for the tables and figures written below.
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/')
figdir  <- paste0(wd, 'Figures/DRS_m3C_sites/Chrs/')

#' Read a two-column, header-less TSV of methylated positions.
#'
#' The first column is expected to hold `transcript_id|position`; it is split
#' on the literal `|` into two separate columns.
#'
#' @param path Path to the TSV file, passed to `read_tsv()`.
#' @return A tibble with columns `transcript_id`, `position` and `kmer`.
#'   `position` stays character because `separate()` is called without
#'   `convert = TRUE`.
read_methylated_position_tsv <- function(path) {

  read_tsv(
    path, col_names = c('position', 'kmer'),
    # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
    show_col_types = FALSE
  ) |>
    separate(position, into = c('transcript_id', 'position'), sep = '[|]')

}

# Session-wide ggplot2 defaults: classic theme with a 7 pt base font and the
# legend placed below the plot. The donut plots override this with theme_void().
theme_set(
  theme_classic(base_size = 7) +
    theme(legend.position = 'bottom')
)

Functions

#' Count rows per group and express each count as a share of all rows.
#'
#' @param df A data frame, usually grouped beforehand with `group_by()`.
#' @return An ungrouped data frame holding the grouping columns, `n` (rows per
#'   group) and `percentage` (`100 * n / sum(n)`), ordered by decreasing
#'   `percentage`.
calc_percentage <- function(df) {
  # reframe() yields one row per group and drops the grouping.
  group_sizes <- reframe(df, n = n())
  with_share <- mutate(group_sizes, percentage = 100 * n / sum(n))
  arrange(with_share, desc(percentage))
}

#' Draw a donut chart with one ring segment per `genetype2` category.
#'
#' @param df Data frame with `genetype2` and `percentage` columns;
#'   `add_yrange()` derives the segment boundaries from `percentage`.
#' @return A ggplot object. The manual scales provide four colours, so at most
#'   four categories are supported.
donutplot_genetype <- function(df) {
  # Same colours for the fill and the outline of each segment.
  segment_colours <- c('#0099ff', '#ff9900', '#ff0099', '#9900ff')
  segments <- add_yrange(df)

  ggplot(
    segments,
    aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = genetype2, colour = genetype2
    )
  ) +
    geom_rect() +
    # Polar coordinates on y bend the stacked rectangles into a ring.
    coord_polar(theta = 'y') +
    ggrepel::geom_text_repel(
      aes(label = genetype2, y = (ymin + ymax) / 2), x = 1
    ) +
    # x limits start below xmin = 2, which leaves the hole in the middle.
    xlim(c(-1, 4)) +
    scale_fill_manual(values = segment_colours) +
    scale_color_manual(values = segment_colours) +
    theme_void()
}

#' # Read methylated position information and add annotation

# Load the cleaned Espresso/AsPC1 annotation table (file dated 2024-03-29);
# it is used below to count the transcripts detected on each seqname.
espresso_annotation <-
  read_tsv(paste0(wd, 'Tables/Espresso_AsPC1_annotation_cleaned_2024-03-29.tsv'))
## Rows: 36717 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): seqname, source, feature, score, strand, frame, gene_id, transcrip...
## dbl  (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_annotation
## # A tibble: 36,717 × 14
##    seqname source feature  start    end score strand frame gene_id transcript_id
##    <chr>   <chr>  <chr>    <dbl>  <dbl> <chr> <chr>  <chr> <chr>   <chr>        
##  1 chr3    annot… transc… 3.15e6 3.15e6 .     -      .     ENSG00… ENST00000498…
##  2 chr3    annot… transc… 3.15e6 3.15e6 .     -      .     ENSG00… ENST00000459…
##  3 chr3    annot… transc… 3.15e6 3.18e6 .     -      .     ENSG00… ENST00000231…
##  4 chr3    annot… transc… 3.15e6 3.18e6 .     -      .     ENSG00… ENST00000432…
##  5 chr3    annot… transc… 3.13e6 3.13e6 .     +      .     ENSG00… ENST00000339…
##  6 chr3    annot… transc… 3.15e6 3.16e6 .     -      .     ENSG00… ENST00000488…
##  7 chr3    annot… transc… 3.13e6 3.13e6 .     +      .     ENSG00… ENST00000420…
##  8 chr3    annot… transc… 3.14e6 3.15e6 .     +      .     ENSG00… ENST00000698…
##  9 chr3    annot… transc… 3.17e6 3.18e6 .     -      .     ENSG00… ENST00000450…
## 10 chr3    annot… transc… 3.15e6 3.15e6 .     +      .     ENSG00… ENST00000698…
## # ℹ 36,707 more rows
## # ℹ 4 more variables: gene_type <chr>, gene_name <chr>, transcript_type <chr>,
## #   transcript_name <chr>
# Load the "intensityup_common" position table and keep only rows whose
# `middle_isC` column equals 'C' (605 rows read, 489 kept in the run echoed
# below).
methylated_positions <- 
  read_tsv(
    paste0(wd, 'Tables/DRS/Positions/intensityup_common_2024-04-10.tsv.gz')
  ) |> 
  filter(middle_isC == 'C')
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
methylated_positions 
## # A tibble: 489 × 65
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  5 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  6 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
##  7 ENST00000389680.2 MT-RNR1-201          148 GCCAC                 1    
##  8 ENST00000389680.2 MT-RNR1-201          153 ACCCC                 1    
##  9 ENST00000389680.2 MT-RNR1-201          154 CCCCC                 1    
## 10 ENST00000389680.2 MT-RNR1-201          155 CCCCA                 1    
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …

Presence of C in kmer

# Share of sites whose ref_kmer has a C flanked by two characters on each side.
# NOTE(review): the pattern is unanchored, so it only tests the centre base if
# every ref_kmer is exactly 5 characters long -- confirm.
methylated_positions |> 
  group_by(grepl('.{2}C.{2}', ref_kmer)) |> 
  calc_percentage()
## # A tibble: 1 × 3
##   `grepl(".{2}C.{2}", ref_kmer)`     n percentage
##   <lgl>                          <int>      <dbl>
## 1 TRUE                             489        100
# Per-seqname distribution of the sites that match the centre-C pattern.
methylated_positions |> 
  filter(grepl('.{2}C.{2}', ref_kmer)) |> 
  group_by(seqname) |> 
  calc_percentage()
## # A tibble: 24 × 3
##    seqname     n percentage
##    <chr>   <int>      <dbl>
##  1 chrM      229      46.8 
##  2 chr12      47       9.61
##  3 chr16      41       8.38
##  4 chr1       34       6.95
##  5 chr11      28       5.73
##  6 chr2       19       3.89
##  7 chr7       16       3.27
##  8 chr19      14       2.86
##  9 chr5        9       1.84
## 10 chr15       6       1.23
## # ℹ 14 more rows
# Per-seqname distribution of the sites that do NOT match the centre-C pattern
# (empty in the run echoed below).
methylated_positions |> 
  filter(!grepl('.{2}C.{2}', ref_kmer)) |> 
  group_by(seqname) |> 
  calc_percentage()
## # A tibble: 0 × 3
## # ℹ 3 variables: seqname <chr>, n <int>, percentage <dbl>
# Share of sites whose ref_kmer contains a C at any position.
methylated_positions |> 
  group_by(grepl('C', ref_kmer)) |> 
  calc_percentage()
## # A tibble: 1 × 3
##   `grepl("C", ref_kmer)`     n percentage
##   <lgl>                  <int>      <dbl>
## 1 TRUE                     489        100
# Per-seqname distribution of the sites whose ref_kmer contains a C anywhere.
methylated_positions |> 
  filter(grepl('C', ref_kmer)) |> 
  group_by(seqname) |> 
  calc_percentage()
## # A tibble: 24 × 3
##    seqname     n percentage
##    <chr>   <int>      <dbl>
##  1 chrM      229      46.8 
##  2 chr12      47       9.61
##  3 chr16      41       8.38
##  4 chr1       34       6.95
##  5 chr11      28       5.73
##  6 chr2       19       3.89
##  7 chr7       16       3.27
##  8 chr19      14       2.86
##  9 chr5        9       1.84
## 10 chr15       6       1.23
## # ℹ 14 more rows
# Sites whose 5-mer has a C exactly at the centre (third) position.
# The pattern is anchored: for 5-mers the result is unchanged, and a k-mer of
# any other length matches nothing instead of silently passing with an
# off-centre C.
methylated_positions_center_C <-
  methylated_positions |>
  filter(grepl('^.{2}C.{2}$', ref_kmer))
methylated_positions_center_C
## # A tibble: 489 × 65
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  5 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  6 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
##  7 ENST00000389680.2 MT-RNR1-201          148 GCCAC                 1    
##  8 ENST00000389680.2 MT-RNR1-201          153 ACCCC                 1    
##  9 ENST00000389680.2 MT-RNR1-201          154 CCCCC                 1    
## 10 ENST00000389680.2 MT-RNR1-201          155 CCCCA                 1    
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Sites whose ref_kmer contains a C at any position.
# filter() rather than group_by(): grouping on the grepl() result only added a
# logical key column and kept every row, whereas the parallel
# methylated_positions_center_C table is a filtered subset.
methylated_positions_C <-
  methylated_positions |>
  filter(grepl('C', ref_kmer))
methylated_positions_C
## # A tibble: 489 × 65
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  5 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  6 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
##  7 ENST00000389680.2 MT-RNR1-201          148 GCCAC                 1    
##  8 ENST00000389680.2 MT-RNR1-201          153 ACCCC                 1    
##  9 ENST00000389680.2 MT-RNR1-201          154 CCCCC                 1    
## 10 ENST00000389680.2 MT-RNR1-201          155 CCCCA                 1    
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …

Distribution among chromosomes

# Number and percentage of all retained sites per seqname.
methylated_positions_groupedby_chr <- 
  methylated_positions |> 
  group_by(seqname) |> 
  calc_percentage()
methylated_positions_groupedby_chr
## # A tibble: 24 × 3
##    seqname     n percentage
##    <chr>   <int>      <dbl>
##  1 chrM      229      46.8 
##  2 chr12      47       9.61
##  3 chr16      41       8.38
##  4 chr1       34       6.95
##  5 chr11      28       5.73
##  6 chr2       19       3.89
##  7 chr7       16       3.27
##  8 chr19      14       2.86
##  9 chr5        9       1.84
## 10 chr15       6       1.23
## # ℹ 14 more rows
# Same per-seqname summary, restricted to the centre-C subset.
methylated_positions_center_C_groupedby_chr <- 
  methylated_positions_center_C |> 
  group_by(seqname) |> 
  calc_percentage()
methylated_positions_center_C_groupedby_chr
## # A tibble: 24 × 3
##    seqname     n percentage
##    <chr>   <int>      <dbl>
##  1 chrM      229      46.8 
##  2 chr12      47       9.61
##  3 chr16      41       8.38
##  4 chr1       34       6.95
##  5 chr11      28       5.73
##  6 chr2       19       3.89
##  7 chr7       16       3.27
##  8 chr19      14       2.86
##  9 chr5        9       1.84
## 10 chr15       6       1.23
## # ℹ 14 more rows
# Same per-seqname summary for methylated_positions_C; group_by(seqname)
# replaces any grouping already present on the input.
methylated_positions_C_groupedby_chr <- 
  methylated_positions_C |> 
  group_by(seqname) |> 
  calc_percentage()
methylated_positions_C_groupedby_chr
## # A tibble: 24 × 3
##    seqname     n percentage
##    <chr>   <int>      <dbl>
##  1 chrM      229      46.8 
##  2 chr12      47       9.61
##  3 chr16      41       8.38
##  4 chr1       34       6.95
##  5 chr11      28       5.73
##  6 chr2       19       3.89
##  7 chr7       16       3.27
##  8 chr19      14       2.86
##  9 chr5        9       1.84
## 10 chr15       6       1.23
## # ℹ 14 more rows

Percentage of mitochondrial RNAs

#' Split rows into mitochondrial (`seqname == 'chrM'`) vs. everything else.
#'
#' @param df Data frame with a `seqname` column.
#' @return Ungrouped data frame with `isChrM` (logical), `n` and `percentage`,
#'   as computed by `calc_percentage()`.
calc_percentage_chrM <- function(df) {

  df |>
    # Name the grouping key directly instead of renaming the auto-generated
    # `seqname == "chrM"` column afterwards; that name depends on how R
    # deparses the expression (quote style, spacing).
    group_by(isChrM = seqname == 'chrM') |>
    calc_percentage() |>
    ungroup()

}

# chrM vs. non-chrM share across all retained sites.
methylated_positions_groupedby_chrMornot <- 
  methylated_positions |> 
  calc_percentage_chrM()
methylated_positions_groupedby_chrMornot
## # A tibble: 2 × 3
##   isChrM     n percentage
##   <lgl>  <int>      <dbl>
## 1 FALSE    260       53.2
## 2 TRUE     229       46.8
# chrM vs. non-chrM share within the centre-C subset.
methylated_positions_center_C_groupedby_chrMornot <- 
  methylated_positions_center_C |> 
  calc_percentage_chrM()
methylated_positions_center_C_groupedby_chrMornot
## # A tibble: 2 × 3
##   isChrM     n percentage
##   <lgl>  <int>      <dbl>
## 1 FALSE    260       53.2
## 2 TRUE     229       46.8
# chrM vs. non-chrM share for methylated_positions_C.
methylated_positions_C_groupedby_chrMornot <- 
  methylated_positions_C |> 
  calc_percentage_chrM()
methylated_positions_C_groupedby_chrMornot
## # A tibble: 2 × 3
##   isChrM     n percentage
##   <lgl>  <int>      <dbl>
## 1 FALSE    260       53.2
## 2 TRUE     229       46.8

Plot

#' Add cumulative `ymin`/`ymax` boundaries for stacked donut segments.
#'
#' @param df Data frame with a numeric `percentage` column; segments are
#'   stacked in the current row order.
#' @return `df` with two added (or overwritten) columns: `ymax`, the running
#'   sum of `percentage / 100`, and `ymin`, the previous row's `ymax` (0 for
#'   the first row).
add_yrange <- function(df) {

  df |>
    mutate(
      ymax = cumsum(percentage / 100),
      # Each segment starts where the previous one ends. Deriving this inside
      # mutate() keeps it the right length for zero-row data frames and
      # consistent with `ymax` on grouped input, unlike a separate `$<-`
      # assignment of c(0, head(ymax, -1)).
      ymin = dplyr::lag(ymax, default = 0)
    )

}

#' Draw a donut chart of the chrM vs. non-chrM shares.
#'
#' @param df Data frame with a logical `isChrM` column and a `percentage`
#'   column; `add_yrange()` derives the segment boundaries from `percentage`.
#' @return A ggplot object with segments filled, outlined and labelled by
#'   `isChrM`.
donutplot_chrM <- function(df) {
  # Two colours, shared by the fill and the outline scale.
  two_tone <- c('blue', 'red')
  ring <- add_yrange(df)

  ggplot(
    ring,
    aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = isChrM, colour = isChrM
    )
  ) +
    geom_rect() +
    # Polar coordinates on y bend the stacked rectangles into a ring.
    coord_polar(theta = 'y') +
    ggrepel::geom_text_repel(
      aes(label = isChrM, y = (ymin + ymax) / 2), x = 1
    ) +
    # x limits start below xmin = 2, which leaves the hole in the middle.
    xlim(c(-1, 4)) +
    scale_fill_manual(values = two_tone) +
    scale_color_manual(values = two_tone) +
    theme_void()
}


# Build and save one chrM/non-chrM donut per site table.
# ggsave_multiple_formats() is not defined in this file (presumably provided
# by myUtilities -- confirm); the plot objects are piped in by name, so the
# variable names are kept as they are.
methylated_positions_groupedby_chrMornot_donutplot <- 
  methylated_positions_groupedby_chrMornot |> 
  donutplot_chrM()
methylated_positions_groupedby_chrMornot_donutplot |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir)

methylated_positions_C_groupedby_chrMornot_donutplot <- 
  methylated_positions_C_groupedby_chrMornot |> 
  donutplot_chrM()
methylated_positions_C_groupedby_chrMornot_donutplot |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir)

methylated_positions_center_C_groupedby_chrMornot_donutplot <- 
  methylated_positions_center_C_groupedby_chrMornot |> 
  donutplot_chrM()
methylated_positions_center_C_groupedby_chrMornot_donutplot |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir)

# Horizontal bar chart of site counts per seqname, ordered by count
# (displayed only; not assigned or saved).
methylated_positions_groupedby_chr |> 
  ggplot(aes(x = reorder(seqname, n), y = n)) +
  geom_bar(stat = 'identity') +
  coord_flip()

% of transcripts with m3C sites in each chromosome

# Number of distinct transcript IDs per seqname in the annotation, largest
# first; used as the denominator for the per-chromosome percentages.
num_detected_transcripts_in_chromosomes <-
  espresso_annotation |>
  distinct(seqname, transcript_id) |>
  count(seqname, name = 'num_detected_transcripts_in_chr', sort = TRUE)
num_detected_transcripts_in_chromosomes
## # A tibble: 69 × 2
##    seqname num_detected_transcripts_in_chr
##    <chr>                             <int>
##  1 chr1                               3605
##  2 chr2                               2776
##  3 chr11                              2422
##  4 chr17                              2237
##  5 chr19                              2236
##  6 chr7                               2146
##  7 chr3                               2121
##  8 chr12                              2105
##  9 chr16                              1906
## 10 chr5                               1733
## # ℹ 59 more rows
# Number of retained sites per transcript, most sites first. The annotation
# columns are included in the grouping so they are carried into the result.
num_sites_in_transcripts <- 
  methylated_positions |> 
  group_by(
    seqname, transcript_id, transcript_name, gene_name, gene_type, transcript_type
  ) |> 
  reframe(num_sites_in_tr = n()) |> 
  arrange(-num_sites_in_tr)
# export_tsv() is not defined in this file; the export path echoed below
# shows the file name is taken from the piped object's name plus a date.
num_sites_in_transcripts |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/num_sites_in_transcripts_2024-07-29.tsv
## # A tibble: 71 × 7
##    seqname transcript_id     transcript_name gene_name gene_type transcript_type
##    <chr>   <chr>             <chr>           <chr>     <chr>     <chr>          
##  1 chr16   ENST00000343262.9 RPS2-201        RPS2      protein_… protein_coding 
##  2 chrM    ENST00000361789.2 MT-CYB-201      MT-CYB    protein_… protein_coding 
##  3 chrM    ENST00000389680.2 MT-RNR1-201     MT-RNR1   Mt_rRNA   Mt_rRNA        
##  4 chrM    ENST00000361453.3 MT-ND2-201      MT-ND2    protein_… protein_coding 
##  5 chrM    ENST00000361381.2 MT-ND4-201      MT-ND4    protein_… protein_coding 
##  6 chrM    ENST00000361624.2 MT-CO1-201      MT-CO1    protein_… protein_coding 
##  7 chr11   ENST00000273550.… FTH1-201        FTH1      protein_… protein_coding 
##  8 chr12   ENST00000392514.9 RPLP0-203       RPLP0     protein_… protein_coding 
##  9 chrM    ENST00000361739.1 MT-CO2-201      MT-CO2    protein_… protein_coding 
## 10 chrM    ENST00000361390.2 MT-ND1-201      MT-ND1    protein_… protein_coding 
## # ℹ 61 more rows
## # ℹ 1 more variable: num_sites_in_tr <int>
# Number of site-carrying transcripts per seqname (one input row per
# transcript), largest first.
num_transcripts_with_m3Csites_groupedby_chr <- 
  num_sites_in_transcripts |> 
  group_by(seqname) |> 
  reframe(n = n()) |> 
  arrange(-n)
num_transcripts_with_m3Csites_groupedby_chr 
## # A tibble: 24 × 2
##    seqname     n
##    <chr>   <int>
##  1 chrM       11
##  2 chr12       9
##  3 chr1        7
##  4 chr11       7
##  5 chr5        4
##  6 chr19       3
##  7 chr2        3
##  8 chr7        3
##  9 chr8        3
## 10 chr14       2
## # ℹ 14 more rows
# Percentage of detected transcripts on each chromosome that carry at least
# one site. Only seqnames containing 'chr' are kept (24 -> 23 rows in the run
# echoed around this block).
# NOTE(review): the join key is implicit; the message below reports `seqname`.
# Passing `by = 'seqname'` would make this explicit and silence the message.
percent_m3CRNAs_in_chr <- 
  num_transcripts_with_m3Csites_groupedby_chr |> 
  filter(grepl('chr', seqname)) |> 
  left_join(num_detected_transcripts_in_chromosomes) |> 
  mutate(percent_m3CRNAs_in_chr = 100 *  n / num_detected_transcripts_in_chr) |> 
  arrange(-percent_m3CRNAs_in_chr)
## Joining with `by = join_by(seqname)`
# Write the per-chromosome percentages to `tabledir`.
percent_m3CRNAs_in_chr |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/percent_m3CRNAs_in_chr_2024-07-29.tsv
## # A tibble: 23 × 4
##    seqname     n num_detected_transcripts_in_chr percent_m3CRNAs_in_chr
##    <chr>   <int>                           <int>                  <dbl>
##  1 chrM       11                              23                 47.8  
##  2 chr12       9                            2105                  0.428
##  3 chr11       7                            2422                  0.289
##  4 chr5        4                            1733                  0.231
##  5 chrX        2                             913                  0.219
##  6 chr8        3                            1408                  0.213
##  7 chr1        7                            3605                  0.194
##  8 chr18       1                             525                  0.190
##  9 chr20       2                            1054                  0.190
## 10 chr13       1                             586                  0.171
## # ℹ 13 more rows
# Horizontal bar chart of the per-chromosome percentage, ordered by value,
# then saved to `figdir`.
percent_m3CRNAs_in_chr_barplot <- 
  percent_m3CRNAs_in_chr |> 
  ggplot(aes(
    x = reorder(seqname, percent_m3CRNAs_in_chr),
    y = percent_m3CRNAs_in_chr)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  labs(x = '', y = '% of transcripts\nwith m3C sites') 
percent_m3CRNAs_in_chr_barplot |> 
  ggsave_multiple_formats(
    width = 4, height = 6, fontsize = 7, outdir = figdir)

Distribution of the number or percentage of m3C sites in each transcript

# One point per transcript: number of sites (value axis) by seqname, with
# coord_flip() putting the seqnames on the vertical axis. reorder() uses its
# default summary of num_sites_in_tr per seqname to order them.
num_sites_in_transcripts |> 
  ggplot(aes(x = reorder(seqname, num_sites_in_tr), y = num_sites_in_tr)) +
  geom_point() +
  coord_flip()

# Classify each site-carrying transcript into a coarse category (`genetype2`)
# and compute the share of transcripts per category.
# NOTE(review): a non-protein-coding gene outside chrM matches no branch and
# gets NA, and every non-protein-coding chrM gene is labelled 'mt-rRNA'
# whatever its gene_type -- confirm this is intended for other datasets.
# The same case_when() is repeated further down for the per-site summary.
num_transcripts_groupedby_genetype <- 
  num_sites_in_transcripts |> 
  mutate(
    genetype2 = case_when(
      gene_type == 'protein_coding' & seqname == 'chrM' ~ 'mt-mRNA',
      gene_type == 'protein_coding' & seqname != 'chrM' ~ 'mRNA',
      gene_type != 'protein_coding' & seqname == 'chrM' ~ 'mt-rRNA',
      # Rows with a missing gene_type fall through to this branch.
      is.na(gene_type) ~ 'unannotated gene'
    )
  ) |> 
  group_by(genetype2) |> 
  calc_percentage() |> 
  # Adds ymin/ymax to the exported table; donutplot_genetype() calls
  # add_yrange() again when plotting.
  add_yrange()
num_transcripts_groupedby_genetype |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/num_transcripts_groupedby_genetype_2024-07-29.tsv
## # A tibble: 4 × 5
##   genetype2            n percentage  ymax  ymin
##   <chr>            <int>      <dbl> <dbl> <dbl>
## 1 mRNA                59      83.1  0.831 0    
## 2 mt-mRNA              9      12.7  0.958 0.831
## 3 mt-rRNA              2       2.82 0.986 0.958
## 4 unannotated gene     1       1.41 1     0.986
# Donut chart of the per-category transcript shares, saved to `figdir`.
num_transcripts_groupedby_genetype_donut <- 
  num_transcripts_groupedby_genetype |> 
  donutplot_genetype()
num_transcripts_groupedby_genetype_donut |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )

Positions grouped by gene type

# Same `genetype2` classification as for the per-transcript summary, but here
# the per-transcript site counts are summed, giving the share of sites (not
# transcripts) per category. Rows are left in group order, not sorted by share.
# NOTE(review): a non-protein-coding gene outside chrM matches no branch and
# gets NA -- confirm this is intended for other datasets.
num_m3Csites_groupedby_genetype <- 
  num_sites_in_transcripts |> 
  mutate(
    genetype2 = case_when(
      gene_type == 'protein_coding' & seqname == 'chrM' ~ 'mt-mRNA',
      gene_type == 'protein_coding' & seqname != 'chrM' ~ 'mRNA',
      gene_type != 'protein_coding' & seqname == 'chrM' ~ 'mt-rRNA',
      # Rows with a missing gene_type fall through to this branch.
      is.na(gene_type) ~ 'unannotated gene'
    )
  ) |> 
  group_by(genetype2) |> 
  reframe(num_m3Csite = sum(num_sites_in_tr)) |> 
  mutate(percentage = 100 * num_m3Csite / sum(num_m3Csite))
num_m3Csites_groupedby_genetype |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/num_m3Csites_groupedby_genetype_2024-07-29.tsv
## # A tibble: 4 × 3
##   genetype2        num_m3Csite percentage
##   <chr>                  <int>      <dbl>
## 1 mRNA                     257     52.6  
## 2 mt-mRNA                  182     37.2  
## 3 mt-rRNA                   47      9.61 
## 4 unannotated gene           3      0.613
# Donut chart of the per-category site shares; printed, then saved to `figdir`.
num_m3Csites_groupedby_genetype_donut <- 
  num_m3Csites_groupedby_genetype |> 
  donutplot_genetype()
num_m3Csites_groupedby_genetype_donut

num_m3Csites_groupedby_genetype_donut |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )